######################
#####            #####
##### 1.0 Set Up #####
#####            #####
######################

##### 1.1 Cleaning the Environment
rm(list = ls())
gc()

##### 1.2 Packages
library(tidyverse)
library(brms)
library(parallel)
library(rstan)
library(stargazer)
library(sandwich)
library(lmtest)
library(survival)
library(survminer)
library(gridExtra)
options(mc.cores = parallel::detectCores())

##### 1.3 Setting Working Directory
setwd("A:/Research/Measuring Workforce Capacity")


##### 1.4 Functions
distplots <- function(x){
plotdata <- data[,x]
colnames(plotdata)[1] <- "var"
    ggplot(plotdata, aes(x=var)) +
              geom_density()+
              geom_vline(aes(xintercept=mean(var,na.rm=TRUE)))
}

`%notin%` <- Negate(`%in%`)

normalize_sample <- function(i,df,mean,sd){
row <- as.numeric(df[i,])
norm <- (row-mean)/sd
return(norm)
}

#############################
#####                   #####
##### 2.0 Cleaning Data #####
#####                   #####
#############################

##### PARTS 2.0 AND 3.0 BELONG TO THE SAME FUNCTION. RUN ESTIMATES IN PART 4.0
##### For running in BRMS, the Data needs to be in long-format. It should contain the following:
# 1. resp: The value of the agency-year for a particular item (i.e. variable of the fedscope data).
# 2. rater: An item in the fedscope dataset.
# 3. item: An agency-year combination.
Capacity_IRT <- function(items,filename){
  
data <- read_csv("ehri_data_oews.csv") %>%
        mutate(POLICYEMPLOYEE_GENRESEARCHINCLUDED_RAWNUM_LOG = ifelse(POLICYEMPLOYEE_GENRESEARCHINCLUDED_RAWNUM_LOG == -Inf,
                                                                      NA,
                                                                      as.numeric(POLICYEMPLOYEE_GENRESEARCHINCLUDED_RAWNUM_LOG)),
               POLICYEMPLOYEE_DC_GENRESEARCHINCLUDED_RAWNUM_LOG = ifelse(POLICYEMPLOYEE_DC_GENRESEARCHINCLUDED_RAWNUM_LOG == -Inf,
                                                                      NA,
                                                                      as.numeric(POLICYEMPLOYEE_DC_GENRESEARCHINCLUDED_RAWNUM_LOG)),
               POLICYEMPLOYEE_GENRESEARCHEXCLUDED_RAWNUM_LOG = ifelse(POLICYEMPLOYEE_GENRESEARCHEXCLUDED_RAWNUM_LOG == -Inf,
                                                                      NA,
                                                                      as.numeric(POLICYEMPLOYEE_GENRESEARCHEXCLUDED_RAWNUM_LOG))) %>%
        mutate(POLICYEMPLOYEE_GENRESEARCHINCLUDED_RAWNUM_LOG = as.numeric(POLICYEMPLOYEE_GENRESEARCHINCLUDED_RAWNUM_LOG),
               POLICYEMPLOYEE_DC_GENRESEARCHINCLUDED_RAWNUM_LOG = as.numeric(POLICYEMPLOYEE_GENRESEARCHINCLUDED_RAWNUM_LOG),
               POLICYEMPLOYEE_GENRESEARCHEXCLUDED_RAWNUM_LOG = as.numeric(POLICYEMPLOYEE_GENRESEARCHEXCLUDED_RAWNUM_LOG),
               POLICYEMPLOYEE_GENRESEARCHINCLUDED_INDUSTRY_SALARY_MEAN = as.numeric(POLICYEMPLOYEE_GENRESEARCHINCLUDED_INDUSTRY_SALARY_MEAN),
               POLICYEMPLOYEE_GENRESEARCHINCLUDED_COLLEGE_PROP = as.numeric(POLICYEMPLOYEE_GENRESEARCHINCLUDED_COLLEGE_PROP))
   
  
##### 2.1 Specify Item Set
total_itemset <- items

##### 2.1 Basic Cleaning Activities
data <- data %>%

#### 2.1.1 Creating an id variable
        mutate(ID = paste(AGYSUB, DATECODE, sep = "-")) %>% 

#### 2.1.2 Subsetting to September Observations
        filter(str_detect(DATECODE, "09$") == TRUE) %>%
        filter(DATECODE != 199809) %>%
        filter(DATECODE != 199909) %>%
        filter(DATECODE != 200009)

#### 2.1.3 Adjusting Salary Data for Inflation
data$DATECODE <- as.numeric(data$DATECODE)
inflation_data <- data.frame(RATE = c(1.588, 1.553, 1.503, 1.462, 1.439, 1.407,
                                      1.370, 1.325, 1.285, 1.248, 1.202, 1.206,
                                      1.187, 1.151, 1.127, 1.111, 1.093, 1.092, 
                                      1.078, 1.056, 1.031, 1.012, 1, 0.95),
                             DATECODE = c(199809, 199909, 200009, 200109, 200209,
                                          200309, 200409, 200509, 200609, 200709,
                                          200809, 200909, 201009, 201109, 201209,
                                          201309, 201409, 201509, 201609, 201709,
                                          201809, 201909, 202009, 202109))
for(i in 1:nrow(inflation_data)){
  year <- inflation_data$DATECODE[i]
  rate <- inflation_data$RATE[i]
  data$POLICYEMPLOYEE_GENRESEARCHINCLUDED_SALARY_MEAN[data$DATECODE == year] <- (data$POLICYEMPLOYEE_GENRESEARCHINCLUDED_SALARY_MEAN[data$DATECODE == year]*rate)/1000
  data$POLICYEMPLOYEE_GENRESEARCHINCLUDED_INDUSTRY_SALARY_MEAN[data$DATECODE == year] <- (data$POLICYEMPLOYEE_GENRESEARCHINCLUDED_INDUSTRY_SALARY_MEAN[data$DATECODE == year]*rate)/1000
  data$POLICYEMPLOYEE_DC_GENRESEARCHINCLUDED_SALARY_MEAN[data$DATECODE == year] <- (data$POLICYEMPLOYEE_DC_GENRESEARCHINCLUDED_SALARY_MEAN[data$DATECODE == year]*rate)/1000
  data$POLICYEMPLOYEE_GENRESEARCHEXCLUDED_SALARY_MEAN[data$DATECODE == year] <- (data$POLICYEMPLOYEE_GENRESEARCHEXCLUDED_SALARY_MEAN[data$DATECODE == year]*rate)/1000
  rm(year,rate)
}  
rm(inflation_data)

#### 2.1.4 Subsetting to agencies in agency list and those with more than fifteen employees
agylist <- read.csv("AGENCY_LIST.csv")
colnames(agylist)[1] <- "ABBR" 
agylist <- agylist %>%
           pivot_longer(cols = 5:length(agylist),
                        names_to = "DATE",
                        values_to = "FEDSCOPE_ID") %>%
           mutate(DATE = str_remove_all(DATE, "X")) %>%
           mutate(DATE = str_remove_all(DATE, " "),
                  FEDSCOPE_ID = str_remove_all(FEDSCOPE_ID," ")) %>%
           filter(FEDSCOPE_ID != "")
  
 
data <- data %>%
        filter(AGYSUB %in% agylist$FEDSCOPE_ID) %>%
        group_by(AGYSUB) %>%
        mutate(MEAN_EMPLOYMENT = mean(TOTAL_RAWNUM),
               MEAN_POLICYMAKER = mean(POLICYEMPLOYEE_GENRESEARCHINCLUDED_RAWNUM,na.rm = T)) %>%
        ungroup() 

data <- data %>%
        filter(MEAN_EMPLOYMENT > 15) %>%
        filter(POLICYEMPLOYEE_GENRESEARCHINCLUDED_RAWNUM > 0) %>%
        filter(MEAN_POLICYMAKER >= 5) %>%
        mutate(POLICYEMPLOYEE_GENRESEARCHINCLUDED_SALARYDIFF_MEAN = POLICYEMPLOYEE_GENRESEARCHINCLUDED_SALARY_MEAN-POLICYEMPLOYEE_GENRESEARCHINCLUDED_INDUSTRY_SALARY_MEAN) %>%
        mutate(POLICYEMPLOYEE_GENRESEARCHINCLUDED_SALARYDIFF_MEAN = ifelse(is.na(POLICYEMPLOYEE_GENRESEARCHINCLUDED_SALARYDIFF_MEAN) == T,0, POLICYEMPLOYEE_GENRESEARCHINCLUDED_SALARYDIFF_MEAN))


#### 2.1.5 Creating a Set of IDS
agy_ids <- data %>% select(ID, AGYSUB, AGYSUBT, DATECODE)
agy_ids$AGY_ID_NUM <- seq(1, length(unique(agy_ids$ID)))

#### 2.1.6 Selecting Only Needed Variables
total_set <- data %>%
             select(ID, all_of(total_itemset)) 



##### 2.2 Scaling Variables

for(i in 2:length(total_set)){
total_set[,i] <- scale(total_set[,i])
}


##### 2.3 Converting Data to Long Format
total_set <- total_set %>%
             pivot_longer(cols = !ID,
                          names_to = "rater",
                          values_to = "rating")

rater_ids <- data.frame(rater = unique(total_set$rater),
                        RATER_ID_NUM = seq(1, length(unique(total_set$rater))))

##### 2.4 Joining Agency IDs
total_set <- total_set %>%
             left_join(agy_ids) %>% 
             left_join(rater_ids) %>%
             filter(is.na(rating) == FALSE) %>%
             distinct()
     
rm(data,agylist,rater_ids) #Agy_List is reloaded for formatting but removed here to save memory

#####################################
##### 3.0 Specifying STAN Model #####
#####################################

##### 3.1 Prepping Data for STAN

J <- length(unique(total_set$ID)) ## Total Number of Agency--Years
K <- length(unique(total_set$rater)) ## Total Number of Indicators
N <- nrow(total_set)
jj <- total_set$AGY_ID_NUM
kk <- total_set$RATER_ID_NUM
y <- total_set$rating
rm(total_set)
##### 3.2 Running the Model in Stan

model_fit <- stan(file = "model.stan", 
                  data = c("J", "K", "N", "jj", "kk", "y"),
                  iter = 10000,
                  warmup = 5000,
                  chains = 4,
                  cores = 4,
                  seed = "1989")

rm(jj,kk,y)

##### 3.4 Model Diagnostic

#### 3.4.1 Divergent transitions (Should be 0)
check_divergences(model_fit)

#### 3.4.2 Check that Rhat is 1 for all parameters and the Eff Sample
print(model_fit)

#### 3.4.3 Check Parameters
param_sum <- as.data.frame(summary(model_fit))
param_sum <- param_sum[(J+1):nrow(param_sum),1:10]
write.csv(param_sum, paste(filename, "_params.csv", sep =""))
rm(param_sum)
##### 3.5 Normalizing 

#### 3.5.1 Extracting the Sample
samples <- as.matrix(model_fit)

#### 3.5.2 Removing Columns Unrelated to Measures

samples <-as.matrix(samples[,1:J])
samp.mean<-mean(samples) #Storing Sample Mean
samp.sd<-sd(samples) #Storing Sample SD

#### 3.5.3 Creating a New Matrix for Normalized Results
samp.local<-matrix(NA,nrow=nrow(samples),ncol=ncol(samples))

colnames(samp.local)<-colnames(samples)

#### 3.5.4 Normalizing Data Row By Row

for(i in 1:nrow(samples)){
    samp.local[i,]<-(samples[i,]-samp.mean)/samp.sd
}
rm(samples,samp.mean,samp.sd)
#### 3.5.5 Creating a New Matrix For Summary Results
scores.sum<-matrix(NA,nrow=J,ncol=5)
rownames(scores.sum)<-agy_ids$ID
scores.sum[,5]<-rownames(scores.sum)


for(i in 1:nrow(agy_ids)){
  scores.sum[i,1]<-mean(samp.local[,i])
  scores.sum[i,2]<-sd(samp.local[,i])  
  scores.sum[i,3]<-quantile(samp.local[,i],probs=c(0.025))
  scores.sum[i,4]<-quantile(samp.local[,i],probs=c(0.975))
}
colnames(scores.sum)<-c("Capacity_Score","Capacity_Score_SD","Capacity_Score_LB","Capacity_Score_UB","ID")
scores.sum <- as.data.frame(scores.sum)
scores.sum <- scores.sum %>%
              mutate(Capacity_Score = as.numeric(Capacity_Score),
                     Capacity_Score_SD = as.numeric(Capacity_Score_SD),
                     Capacity_Score_LB = as.numeric(Capacity_Score_LB),
                     Capacity_Score_UB = as.numeric(Capacity_Score_UB)) %>%
              left_join(agy_ids)

##### 3.6 Formating and Saving Scores

#### 3.6.1 Loading Agency List
agylist <- read.csv("AGENCY_LIST.csv")
colnames(agylist)[1] <- "ABBR" 
agylist <- agylist %>%
  pivot_longer(cols = 5:length(agylist),
               names_to = "DATE",
               values_to = "FEDSCOPE_ID") %>%
  mutate(DATE = str_remove_all(DATE, "X")) %>%
  mutate(DATE = str_remove_all(DATE, " "),
         FEDSCOPE_ID = str_remove_all(FEDSCOPE_ID," ")) %>%
  filter(FEDSCOPE_ID != "")

#### 3.6.1 Renaming 
scores.sum <- scores.sum %>%
              rename(FEDSCOPE_ID = AGYSUB,
                     DATE = DATECODE,
                     UNIQUE_ID = ID,
                     CAPACITY_SCORE = Capacity_Score,
                     CAPACITY_SD = Capacity_Score_SD,
                     CAPACITY_LOWERBOUND = Capacity_Score_LB,
                     CAPACITY_UPPERBOUND = Capacity_Score_UB) %>%
              select(FEDSCOPE_ID, DATE, UNIQUE_ID, CAPACITY_SCORE, CAPACITY_SD, CAPACITY_LOWERBOUND, CAPACITY_UPPERBOUND)

rows_premerge <- nrow(scores.sum)

#### 3.6.2 Merging 

scores.sum <- scores.sum %>% 
              mutate(DATE = as.character(DATE)) %>%
              left_join(agylist) %>%
              select(UNIQUE_ID, ABBR, AGENCY, BUREAU, AKA,  FEDSCOPE_ID, DATE, CAPACITY_SCORE, CAPACITY_SD, CAPACITY_LOWERBOUND, CAPACITY_UPPERBOUND)

rows_postmerge <- nrow(scores.sum)

rows_postmerge - rows_premerge #Check whether the merge created new rows on accident. Should = 0.
sum(is.na(scores.sum$ABBR)) #Check whether any rows are missing an Abbreviation.



#### 3.6.3 Saving Scores

write.csv(scores.sum, file=paste(filename, "_ests.csv", sep =""))
}

##############################
#####                    #####
##### 4.0 RUNNING MODELS #####
#####                    #####
##############################

#### Model 017
items017 <- c("POLICYEMPLOYEE_GENRESEARCHINCLUDED_RAWNUM_LOG",
              "POLICYEMPLOYEE_GENRESEARCHINCLUDED_PROP",
              "POLICYEMPLOYEE_GENRESEARCHINCLUDED_LOS_MEAN",
              "POLICYEMPLOYEE_GENRESEARCHINCLUDED_SALARYDIFF_MEAN",
              "POLICYEMPLOYEE_GENRESEARCHINCLUDED_COLLEGE_PROP")
name017 <- "policymaking_scores_m017"
Capacity_IRT(items017,name017)
rm(items017,name017)